{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "7ad7a029-0cb0-469a-9812-926949d4692c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from tqdm import tqdm_notebook, tqdm\n",
    "from collections import Counter\n",
    "\n",
    "# Load the per-user click log (columns: uid, vid, cid, playtime, duration, date, rank).\n",
    "uid = pd.read_csv('uid_click_log.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "17077775-9529-46ec-8e13-8fea9c8f199e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uid</th>\n",
       "      <th>vid</th>\n",
       "      <th>cid</th>\n",
       "      <th>playtime</th>\n",
       "      <th>duration</th>\n",
       "      <th>date</th>\n",
       "      <th>rank</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100000</td>\n",
       "      <td>100870</td>\n",
       "      <td>100037</td>\n",
       "      <td>4966</td>\n",
       "      <td>9590</td>\n",
       "      <td>20200706</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>100000</td>\n",
       "      <td>101167</td>\n",
       "      <td>100024</td>\n",
       "      <td>5037</td>\n",
       "      <td>4949</td>\n",
       "      <td>20200706</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>100000</td>\n",
       "      <td>103608</td>\n",
       "      <td>100008</td>\n",
       "      <td>5137</td>\n",
       "      <td>14884</td>\n",
       "      <td>20200706</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>100000</td>\n",
       "      <td>100220</td>\n",
       "      <td>100084</td>\n",
       "      <td>5139</td>\n",
       "      <td>16784</td>\n",
       "      <td>20200706</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>100000</td>\n",
       "      <td>101674</td>\n",
       "      <td>100027</td>\n",
       "      <td>5149</td>\n",
       "      <td>5760</td>\n",
       "      <td>20200706</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      uid     vid     cid  playtime  duration      date  rank\n",
       "0  100000  100870  100037      4966      9590  20200706     1\n",
       "1  100000  101167  100024      5037      4949  20200706     2\n",
       "2  100000  103608  100008      5137     14884  20200706     3\n",
       "3  100000  100220  100084      5139     16784  20200706     4\n",
       "4  100000  101674  100027      5149      5760  20200706     5"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first rows to check the columns that were loaded.\n",
    "uid.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "19ce9314-f0ae-46a3-a331-4e42fe2dbc7d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uid</th>\n",
       "      <th>vid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100000</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>100000</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>100000</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>100000</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>100000</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25190</th>\n",
       "      <td>105998</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25191</th>\n",
       "      <td>105998</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25192</th>\n",
       "      <td>105998</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25193</th>\n",
       "      <td>105998</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25194</th>\n",
       "      <td>105998</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>25195 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          uid  vid\n",
       "0      100000    1\n",
       "1      100000    2\n",
       "2      100000    3\n",
       "3      100000    4\n",
       "4      100000    5\n",
       "...       ...  ...\n",
       "25190  105998    1\n",
       "25191  105998    2\n",
       "25192  105998    3\n",
       "25193  105998    4\n",
       "25194  105998    5\n",
       "\n",
       "[25195 rows x 2 columns]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load example.csv (uid, vid columns); its uid values are used later to select\n",
    "# which users receive predictions.\n",
    "# NOTE(review): presumably the sample-submission template - confirm.\n",
    "example = pd.read_csv('example.csv')\n",
    "example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "31740fb5-60d2-4fb6-ac1e-f9d9be96265f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'a': 4, 'b': 5, 'c': 4, 'd': 5}\n"
     ]
    }
   ],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "def merge_dicts_by_adding_values(*dicts):\n",
    "    \"\"\"Combine several mappings into one plain dict, summing the values of shared keys.\n",
    "\n",
    "    Keys appear in the result in the order they are first encountered across\n",
    "    ``dicts``. Calling the function with no arguments returns an empty dict.\n",
    "    \"\"\"\n",
    "    totals = {}\n",
    "    for mapping in dicts:\n",
    "        for name, amount in mapping.items():\n",
    "            # A key seen for the first time starts from 0, like defaultdict(int).\n",
    "            totals[name] = totals.get(name, 0) + amount\n",
    "    return totals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "bcdd3709-fe79-4fd6-84dc-458c47b83724",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6000, 9992)"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct users and distinct videos in the click log, as (n_users, n_videos).\n",
    "tuple(uid[column].nunique() for column in ('uid', 'vid'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "4ed085d8-6f1b-4e4a-bbfb-6641eb5b3736",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out each user's last five rows (in file order) for validation;\n",
    "# every other row of the log stays in the training set.\n",
    "val_data = uid.groupby(by='uid').tail(n=5)\n",
    "train_data = uid.drop(index=val_data.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "dd0499f0-7909-4dd9-92f6-8f728e0236d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6000/6000 [00:02<00:00, 2254.47it/s]\n"
     ]
    }
   ],
   "source": [
    "# Count item-to-item transitions: freq[a][b] is how many times video b directly\n",
    "# follows video a within one user's training rows (rows are taken in file order).\n",
    "freq = {}\n",
    "for user_id, user_rows in tqdm(train_data.groupby('uid')):\n",
    "    watched = user_rows['vid'].values\n",
    "    for current_vid, next_vid in zip(watched[:-1], watched[1:]):\n",
    "        followers = freq.setdefault(current_vid, {})\n",
    "        followers[next_vid] = followers.get(next_vid, 0) + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "83cedc07-89bd-4dff-a908-72acd722b3e9",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6000/6000 [00:36<00:00, 165.99it/s]\n"
     ]
    }
   ],
   "source": [
    "# Offline check: for each user, score candidate next videos from the transition counts\n",
    "# of their last 3 training videos, keep the top 5 candidates the user has not watched in\n",
    "# training, and count the user as a hit when any of them appears in the held-out rows.\n",
    "# recall_top5 ends up as the number of users with at least one hit.\n",
    "\n",
    "# Build every user's held-out video set once. Filtering val_data inside the loop\n",
    "# rescans the whole frame for each user, which is loop-invariant work.\n",
    "val_vids_by_user = {user: set(vids.values) for user, vids in val_data.groupby('uid')['vid']}\n",
    "\n",
    "recall_top5 = 0\n",
    "for u, df in tqdm(train_data.groupby('uid')):\n",
    "    user_vids = df['vid'].values\n",
    "    seen_vids = set(user_vids)\n",
    "    recent_vids = user_vids[-3:]\n",
    "    pred_freq = merge_dicts_by_adding_values(*[freq[x] for x in recent_vids if x in freq])\n",
    "    # Stable ascending sort followed by a reversal: highest count first, and among\n",
    "    # equal counts the candidate inserted last comes first.\n",
    "    pred_vids = [k for k, v in sorted(pred_freq.items(), key=lambda item: item[1])][::-1]\n",
    "    pred_vids_top5 = [x for x in pred_vids if x not in seen_vids][:5]\n",
    "    recall_top5 += int(len(set(pred_vids_top5) & val_vids_by_user.get(u, set())) > 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "cf158672-4a31-4c66-94ce-5f084e0d6ff6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "619"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of users with at least one held-out video among their top-5 predictions\n",
    "# (a raw count, not a rate).\n",
    "recall_top5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "15620157-18d2-4669-be28-19b83264ccfb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6000/6000 [00:02<00:00, 2022.89it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build the submission rows: up to 5 predicted videos for every user listed in example.csv.\n",
    "# NOTE(review): freq is built from train_data only, so transitions inside the held-out\n",
    "# rows are not counted here - confirm whether it should be refit on the full log.\n",
    "example_uids = set(example['uid'].values)  # set lookup instead of scanning the array per user\n",
    "\n",
    "submit = []\n",
    "for u, df in tqdm(uid.groupby('uid')):\n",
    "    if u not in example_uids:\n",
    "        continue\n",
    "\n",
    "    recent_vids = df['vid'].values[-3:]\n",
    "    pred_freq = merge_dicts_by_adding_values(*[freq[x] for x in recent_vids if x in freq])\n",
    "    # Highest transition count first (stable ascending sort, then reversed).\n",
    "    pred_vids = [k for k, v in sorted(pred_freq.items(), key=lambda item: item[1])][::-1]\n",
    "    # Already-watched videos are not filtered out in this cell, and users with fewer than\n",
    "    # 5 candidates simply get fewer rows.\n",
    "    # NOTE(review): the padding with vid 102477 that used to be commented out here is not\n",
    "    # applied - confirm whether the submission needs exactly 5 rows per user.\n",
    "    #\n",
    "    # Emit this user's own candidates. The loop used to iterate pred_vids_top5, a name this\n",
    "    # cell never assigns: on a fresh kernel that is a NameError, otherwise every user got\n",
    "    # the stale list left behind by another cell.\n",
    "    for v in pred_vids[:5]:\n",
    "        submit.append([u, v])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "a01cd1ad-f500-4ec3-b110-e8e4436d28a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the collected [uid, vid] pairs into a two-column frame. Passing the column names\n",
    "# to the constructor also works when no rows were collected; assigning .columns on the\n",
    "# resulting zero-column frame would raise a length-mismatch ValueError instead.\n",
    "submit = pd.DataFrame(submit, columns=['uid', 'vid'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "0e9be07d-d120-450a-a317-dc5e6b2f99d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the predictions without the row index; `index` is documented as a bool,\n",
    "# so pass False explicitly rather than relying on None being falsy.\n",
    "submit.to_csv('s2.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7922745-86d9-427e-b1e3-7b5da9f7b901",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
